import pandas as pd
import numpy as np
import matplotlib as plt
import os
# Current working directory. As a bare expression its value is only echoed in
# an interactive session/notebook; it is discarded when run as a plain script.
os.getcwd()

# NOTE(review): absolute, machine-specific path — the script only runs where
# this file exists.
train = pd.read_csv("/Users/piyushmishra/Downloads/Problems/Energy/train.csv")

# Quick look at the data: first ten rows, summary statistics, (rows, columns).
train.head(10)
train.describe()
train.shape

# Number of missing values in each column. The vectorized isnull().sum()
# yields the same per-column counts as applying a Python-level sum to every
# column, without the per-element Python loop.
train.isnull().sum()
import sklearn
import shap
from sklearn.model_selection import train_test_split
# Separate the target column ("Energy") from the predictor columns.
y = train["Energy"]
x = train.drop(columns=["Energy"])

# Seed passed to the splitter so the partition is the same on every run.
seed = 1

# Split the dataset into training and test sets, holding out 20% of the rows.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=seed,
)

# Report the shapes of the training and test partitions.
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)
# fit a model
from sklearn.ensemble import RandomForestRegressor
# Fit a random-forest regressor on the training partition. The forest is
# seeded with the same `seed` used for the train/test split so that the fitted
# model, its predictions and the score below are repeatable between runs
# (an unseeded forest gives a different result every time).
rf = RandomForestRegressor(random_state=seed)
model = rf.fit(X_train, y_train)  # fit() returns the estimator, so `model` is `rf`

# Predict on the held-out rows; the slice shows the first five predictions
# when evaluated interactively.
predictions = rf.predict(X_test)
predictions[0:5]

# Print the regressor's score() on the held-out data.
print(rf.score(X_test, y_test))
# Load the JS visualization code into the notebook (needed by force_plot).
shap.initjs()

# Keep the explainer around: force_plot needs its expected (base) value in
# addition to the per-feature SHAP values.
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_test)

# Visualize the first test-set prediction's explanation. Since shap 0.20 the
# base value must be passed as the first argument to force_plot.
shap.force_plot(explainer.expected_value, shap_values[0, :], X_test.iloc[0, :])

# Visualize the explanations for all test-set predictions.
shap.force_plot(explainer.expected_value, shap_values, X_test)

# SHAP dependence plot: effect of the single feature "RH_7" across the test set.
shap.dependence_plot("RH_7", shap_values, X_test)

# Summarize the effects of all the features.
shap.summary_plot(shap_values, X_test)